The Boston Housing Dataset is derived from information collected by the U.S. Census Service concerning housing in the area of Boston, MA.
The columns are as follows:
We need to provide information to help with making an informed decision by answering the following questions:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import scipy.stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Fetch the Boston housing CSV from the course's cloud-object-storage bucket
# and load it into a DataFrame.
boston_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ST0151EN-SkillsNetwork/labs/boston_housing.csv'
boston_df=pd.read_csv(boston_url)
boston_df.head()  # preview the first five rows
| Unnamed: 0 | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | LSTAT | MEDV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 4.98 | 24.0 |
| 1 | 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 9.14 | 21.6 |
| 2 | 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 4.03 | 34.7 |
| 3 | 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 2.94 | 33.4 |
| 4 | 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 5.33 | 36.2 |
# A fractional number of rooms is not meaningful for the analysis below,
# so round RM to the nearest integer.
boston_df['RM'] = np.round(boston_df['RM'])
boston_df.head()  # confirm RM now holds whole numbers
| Unnamed: 0 | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | LSTAT | MEDV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 7.0 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 4.98 | 24.0 |
| 1 | 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.0 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 9.14 | 21.6 |
| 2 | 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.0 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 4.03 | 34.7 |
| 3 | 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 7.0 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 2.94 | 33.4 |
| 4 | 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.0 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 5.33 | 36.2 |
#We need to categorize AGE into 3 groups for the test
# NOTE(review): a row with AGE exactly 70 falls into '70 years and older',
# so the middle label effectively means "strictly between 35 and 70".
boston_df.loc[(boston_df['AGE'] <= 35), 'age_group'] = '35 years and younger'
boston_df.loc[(boston_df['AGE'] > 35) & (boston_df['AGE'] < 70), 'age_group'] = 'between 35 and 70 years'
boston_df.loc[(boston_df['AGE'] >= 70), 'age_group'] = '70 years and older'
# Readable YES/NO version of the CHAS (Charles River) 0/1 dummy variable.
boston_df.loc[(boston_df['CHAS']==1),'CHAS_'] = 'YES'
boston_df.loc[(boston_df['CHAS']==0),'CHAS_'] = 'NO'
boston_df.isnull().sum() #CHECK FOR NAN VALUES
Unnamed: 0 0 CRIM 0 ZN 0 INDUS 0 CHAS 0 NOX 0 RM 0 AGE 0 DIS 0 RAD 0 TAX 0 PTRATIO 0 LSTAT 0 MEDV 0 age_group 0 CHAS_ 0 dtype: int64
# Distribution of the PTRATIO variable.
fig = px.histogram(boston_df, x="PTRATIO", title= "Histogram of PTRATIO variable")
fig.show()
# Share of the sample accounted for by 138 observations — presumably the
# count of the tallest histogram bin read off the plot above; verify.
138/len(boston_df['PTRATIO'])
0.2727272727272727
# Count of houses that do vs. do not bound the Charles River.
fig = px.histogram(boston_df, x="CHAS_", title='Counts houses bounded by Charles river')
fig.show()
# Relationship between the (rounded) number of rooms and TAX.
fig = px.scatter(boston_df, x="RM", y="TAX", title="Average number of rooms Vs TAX")
fig.show()
# Exploratory scratch work kept for reference:
#boston_df.query("TAX >= 666")
#boston_df.query("TAX >= 666")['MEDV'].hist(bins=15)
# Boxplot of MEDV to eyeball its spread and outliers.
fig = px.box(boston_df, y="MEDV", title ='Boxplot for Median value of owner-occupied homes')
fig.show()
# MEDV split by the three AGE groups created above.
fig = px.box(boston_df, x="age_group", y="MEDV")
fig.show()
fig = px.scatter(boston_df, x="NOX", y="INDUS", title="Nitric oxide concentrations Vs the proportion of non-retail business acres per town")
fig.show()
# Distribution plot (histogram + KDE) of MEDV.
fig = ff.create_distplot([boston_df['MEDV']], ['distplot'])
fig.update_layout(title_text='MEDV variable Distribution')
fig.show()
# Flags IQR (Tukey-fence) outliers by overwriting them with NaN.
def replace_outliers(data, col):
    """Replace Tukey-fence outliers with ``np.nan``, in place.

    Values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are set to ``np.nan`` so
    the caller can remove them afterwards with ``dropna``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    col : str or iterable of str
        Column name(s) to screen. A single column name is accepted for
        backward compatibility with the existing calls in this file.
    """
    # Generalized: accept either one column name or a collection of them
    # (the original looped over the single-element list [col]).
    columns = [col] if isinstance(col, str) else list(col)
    for name in columns:
        q75, q25 = np.percentile(data.loc[:, name], [75, 25])
        iqr = q75 - q25
        # Renamed from max/min: the original shadowed the builtins.
        upper = q75 + 1.5 * iqr  # Tukey upper fence
        lower = q25 - 1.5 * iqr  # Tukey lower fence
        data.loc[data[name] < lower, name] = np.nan
        data.loc[data[name] > upper, name] = np.nan
# Flag MEDV outliers as NaN, then count missing values per column to see
# how many rows were affected (only MEDV should show a nonzero count).
replace_outliers(boston_df, "MEDV")
boston_df.isnull().sum()
Unnamed: 0 0 CRIM 0 ZN 0 INDUS 0 CHAS 0 NOX 0 RM 0 AGE 0 DIS 0 RAD 0 TAX 0 PTRATIO 0 LSTAT 0 MEDV 40 age_group 0 CHAS_ 0 dtype: int64
# Drop the rows whose MEDV was flagged as an outlier, then re-check.
boston_df = boston_df.dropna(axis = 0)
boston_df.isnull().sum()
Unnamed: 0 0 CRIM 0 ZN 0 INDUS 0 CHAS 0 NOX 0 RM 0 AGE 0 DIS 0 RAD 0 TAX 0 PTRATIO 0 LSTAT 0 MEDV 0 age_group 0 CHAS_ 0 dtype: int64
# Split MEDV by age group for the one-way ANOVA below.
first_group = pd.DataFrame(boston_df[boston_df['age_group'] == '35 years and younger']['MEDV'])
second_group = pd.DataFrame( boston_df[boston_df['age_group'] == 'between 35 and 70 years']['MEDV'])
third_group = pd.DataFrame( boston_df[boston_df['age_group'] == '70 years and older']['MEDV'])
#Replace outliers with nan values then remove them
# NOTE(review): only the second and third groups are screened for outliers;
# first_group is used as-is — confirm this asymmetry is intentional.
replace_outliers(second_group, "MEDV")
replace_outliers(third_group, "MEDV")
second_group = second_group.dropna(axis = 0)
third_group = third_group.dropna(axis = 0)
boston_df.isnull().sum()
Unnamed: 0 0 CRIM 0 ZN 0 INDUS 0 CHAS 0 NOX 0 RM 0 AGE 0 DIS 0 RAD 0 TAX 0 PTRATIO 0 LSTAT 0 MEDV 0 age_group 0 CHAS_ 0 dtype: int64
The following assumption must be met:
Hypotheses
# Levene's test for equality of MEDV variances between river-adjacent and
# non-adjacent houses (prerequisite for the equal-variance t-test below).
scipy.stats.levene(boston_df[boston_df['CHAS_'] == 'YES']['MEDV'],
boston_df[boston_df['CHAS_'] == 'NO']['MEDV'], center='mean')
# P-value (0.44) > 0.05: the equal-variance assumption holds, so the
# standard (equal_var=True) t-test can be applied.
LeveneResult(statistic=0.5851832979054116, pvalue=0.4446766650538493)
# Two-sample t-test: MEDV of houses on vs. off the Charles River,
# assuming equal variances (Levene's test on CHAS_ did not reject equality).
scipy.stats.ttest_ind(boston_df[boston_df['CHAS_'] == 'YES']['MEDV'],
boston_df[boston_df['CHAS_'] == 'NO']['MEDV'], equal_var = True)
Ttest_indResult(statistic=2.100509858104912, pvalue=0.03622310352018016)
Conclusion: P-value < 0.05, so we reject H_0: there is a significant difference in median value (MEDV) between houses bounded by the Charles River and houses that are not.
Hypotheses
Levene test for equality of variance
# Levene's test across the three age groups (ANOVA equal-variance assumption).
scipy.stats.levene(first_group['MEDV'], second_group['MEDV'], third_group['MEDV'], center='mean')
# NOTE(review): the reported p-value (1.9e-09) is < 0.05, so the variances
# are NOT equal — the original comment stated the opposite; interpret the
# ANOVA below with caution (a Welch-type test would be more appropriate).
LeveneResult(statistic=21.014874779508904, pvalue=1.912481887995644e-09)
# One-way ANOVA: does mean MEDV differ across the three age groups?
# NOTE(review): passing DataFrames makes f_oneway return length-1 arrays;
# passing the 'MEDV' Series instead would yield plain scalars.
f_statistic, p_value = scipy.stats.f_oneway(first_group, second_group, third_group)
print("F_Statistic: {0}, P-Value: {1}".format(f_statistic,p_value))
F_Statistic: [98.19383233], P-Value: [5.18382061e-36]
Conclusion: Since the p-value < 0.05, we reject the null hypothesis and conclude that at least one age group differs significantly in median home value (MEDV).
Hypotheses
Since they are both continuous variables we can use a pearson correlation test and draw a scatter plot
# Pearson correlation between NOX and INDUS; returns (r, two-tailed p-value).
scipy.stats.pearsonr(boston_df['NOX'], boston_df['INDUS'])
(0.766019356646604, 4.3275691279205567e-91)
Conclusion: Since the p-value (Sig., 2-tailed) < 0.05, we reject the null hypothesis and conclude that there is a relationship between NOX and INDUS.
# Simple linear regression: MEDV ~ DIS.
X = boston_df['DIS']
y = boston_df['MEDV']
## add an intercept (beta_0) to our model
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)  # fitted values; unused below, kept for inspection
# Print out the statistics
model.summary()
C:\Users\fatma\anaconda3\lib\site-packages\statsmodels\tsa\tsatools.py:142: FutureWarning: In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only
| Dep. Variable: | MEDV | R-squared: | 0.193 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.192 |
| Method: | Least Squares | F-statistic: | 111.3 |
| Date: | Fri, 11 Nov 2022 | Prob (F-statistic): | 1.81e-23 |
| Time: | 23:44:25 | Log-Likelihood: | -1479.4 |
| No. Observations: | 466 | AIC: | 2963. |
| Df Residuals: | 464 | BIC: | 2971. |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 15.5997 | 0.555 | 28.124 | 0.000 | 14.510 | 16.690 |
| DIS | 1.3354 | 0.127 | 10.550 | 0.000 | 1.087 | 1.584 |
| Omnibus: | 17.196 | Durbin-Watson: | 0.631 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 18.287 |
| Skew: | 0.484 | Prob(JB): | 0.000107 |
| Kurtosis: | 3.071 | Cond. No. | 9.41 |
Conclusion: Each additional unit of DIS is associated with an increase in MEDV of approximately 1.34.
# Scatter of DIS vs MEDV with the fitted OLS trendline overlaid.
fig = px.scatter(boston_df, x="DIS", y="MEDV", trendline="ols")
fig.show()